{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# Simple Reinforcement Learning with Tensorflow: Part 3 - Model-Based RL\n",
    "In this iPython notebook we implement a policy and model network which work in tandem to solve the CartPole reinforcement learning problem. To learn more, read here: https://medium.com/p/9a6fe0cce99\n",
    "\n",
    "For more reinforcment learning tutorials, see:\n",
    "https://github.com/awjuliani/DeepRL-Agents"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Loading libraries and starting CartPole environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "import numpy as np\n",
    "try:\n",
    "    import cPickle as pickle\n",
    "except ModuleNotFoundError:\n",
    "    import pickle\n",
    "import tensorflow as tf\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "if sys.version_info.major > 2:\n",
    "    xrange = range\n",
    "del sys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import gym\n",
    "env = gym.make('CartPole-v0')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Setting Hyper-parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# hyperparameters\n",
    "H = 8 # number of hidden layer neurons\n",
    "learning_rate = 1e-2\n",
    "gamma = 0.99 # discount factor for reward\n",
    "decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2\n",
    "resume = False # resume from previous checkpoint?\n",
    "\n",
    "model_bs = 3 # Batch size when learning from model\n",
    "real_bs = 3 # Batch size when learning from real environment\n",
    "\n",
    "# model initialization\n",
    "D = 4 # input dimensionality"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Policy Network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "tf.reset_default_graph()\n",
    "observations = tf.placeholder(tf.float32, [None,4] , name=\"input_x\")\n",
    "W1 = tf.get_variable(\"W1\", shape=[4, H],\n",
    "           initializer=tf.contrib.layers.xavier_initializer())\n",
    "layer1 = tf.nn.relu(tf.matmul(observations,W1))\n",
    "W2 = tf.get_variable(\"W2\", shape=[H, 1],\n",
    "           initializer=tf.contrib.layers.xavier_initializer())\n",
    "score = tf.matmul(layer1,W2)\n",
    "probability = tf.nn.sigmoid(score)\n",
    "\n",
    "tvars = tf.trainable_variables()\n",
    "input_y = tf.placeholder(tf.float32,[None,1], name=\"input_y\")\n",
    "advantages = tf.placeholder(tf.float32,name=\"reward_signal\")\n",
    "adam = tf.train.AdamOptimizer(learning_rate=learning_rate)\n",
    "W1Grad = tf.placeholder(tf.float32,name=\"batch_grad1\")\n",
    "W2Grad = tf.placeholder(tf.float32,name=\"batch_grad2\")\n",
    "batchGrad = [W1Grad,W2Grad]\n",
    "loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability))\n",
    "loss = -tf.reduce_mean(loglik * advantages) \n",
    "newGrads = tf.gradients(loss,tvars)\n",
    "updateGrads = adam.apply_gradients(zip(batchGrad,tvars))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Model Network\n",
    "Here we implement a multi-layer neural network that predicts the next observation, reward, and done state from a current state and action."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "mH = 256 # model layer size\n",
    "\n",
    "input_data = tf.placeholder(tf.float32, [None, 5])\n",
    "with tf.variable_scope('rnnlm'):\n",
    "    softmax_w = tf.get_variable(\"softmax_w\", [mH, 50])\n",
    "    softmax_b = tf.get_variable(\"softmax_b\", [50])\n",
    "\n",
    "previous_state = tf.placeholder(tf.float32, [None,5] , name=\"previous_state\")\n",
    "W1M = tf.get_variable(\"W1M\", shape=[5, mH],\n",
    "           initializer=tf.contrib.layers.xavier_initializer())\n",
    "B1M = tf.Variable(tf.zeros([mH]),name=\"B1M\")\n",
    "layer1M = tf.nn.relu(tf.matmul(previous_state,W1M) + B1M)\n",
    "W2M = tf.get_variable(\"W2M\", shape=[mH, mH],\n",
    "           initializer=tf.contrib.layers.xavier_initializer())\n",
    "B2M = tf.Variable(tf.zeros([mH]),name=\"B2M\")\n",
    "layer2M = tf.nn.relu(tf.matmul(layer1M,W2M) + B2M)\n",
    "wO = tf.get_variable(\"wO\", shape=[mH, 4],\n",
    "           initializer=tf.contrib.layers.xavier_initializer())\n",
    "wR = tf.get_variable(\"wR\", shape=[mH, 1],\n",
    "           initializer=tf.contrib.layers.xavier_initializer())\n",
    "wD = tf.get_variable(\"wD\", shape=[mH, 1],\n",
    "           initializer=tf.contrib.layers.xavier_initializer())\n",
    "\n",
    "bO = tf.Variable(tf.zeros([4]),name=\"bO\")\n",
    "bR = tf.Variable(tf.zeros([1]),name=\"bR\")\n",
    "bD = tf.Variable(tf.ones([1]),name=\"bD\")\n",
    "\n",
    "\n",
    "predicted_observation = tf.matmul(layer2M,wO,name=\"predicted_observation\") + bO\n",
    "predicted_reward = tf.matmul(layer2M,wR,name=\"predicted_reward\") + bR\n",
    "predicted_done = tf.sigmoid(tf.matmul(layer2M,wD,name=\"predicted_done\") + bD)\n",
    "\n",
    "true_observation = tf.placeholder(tf.float32,[None,4],name=\"true_observation\")\n",
    "true_reward = tf.placeholder(tf.float32,[None,1],name=\"true_reward\")\n",
    "true_done = tf.placeholder(tf.float32,[None,1],name=\"true_done\")\n",
    "\n",
    "\n",
    "predicted_state = tf.concat([predicted_observation,predicted_reward,predicted_done],1)\n",
    "\n",
    "observation_loss = tf.square(true_observation - predicted_observation)\n",
    "\n",
    "reward_loss = tf.square(true_reward - predicted_reward)\n",
    "\n",
    "done_loss = tf.multiply(predicted_done, true_done) + tf.multiply(1-predicted_done, 1-true_done)\n",
    "done_loss = -tf.log(done_loss)\n",
    "\n",
    "model_loss = tf.reduce_mean(observation_loss + done_loss + reward_loss)\n",
    "\n",
    "modelAdam = tf.train.AdamOptimizer(learning_rate=learning_rate)\n",
    "updateModel = modelAdam.minimize(model_loss)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Helper-functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def resetGradBuffer(gradBuffer):\n",
    "    for ix,grad in enumerate(gradBuffer):\n",
    "        gradBuffer[ix] = grad * 0\n",
    "    return gradBuffer\n",
    "        \n",
    "def discount_rewards(r):\n",
    "    \"\"\" take 1D float array of rewards and compute discounted reward \"\"\"\n",
    "    discounted_r = np.zeros_like(r)\n",
    "    running_add = 0\n",
    "    for t in reversed(xrange(0, r.size)):\n",
    "        running_add = running_add * gamma + r[t]\n",
    "        discounted_r[t] = running_add\n",
    "    return discounted_r\n",
    "\n",
    "\n",
    "# This function uses our model to produce a new state when given a previous state and action\n",
    "def stepModel(sess, xs, action):\n",
    "    toFeed = np.reshape(np.hstack([xs[-1][0],np.array(action)]),[1,5])\n",
    "    myPredict = sess.run([predicted_state],feed_dict={previous_state: toFeed})\n",
    "    reward = myPredict[0][:,4]\n",
    "    observation = myPredict[0][:,0:4]\n",
    "    observation[:,0] = np.clip(observation[:,0],-2.4,2.4)\n",
    "    observation[:,2] = np.clip(observation[:,2],-0.4,0.4)\n",
    "    doneP = np.clip(myPredict[0][:,5],0,1)\n",
    "    if doneP > 0.1 or len(xs)>= 300:\n",
    "        done = True\n",
    "    else:\n",
    "        done = False\n",
    "    return observation, reward, done"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Training the Policy and Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "xs,drs,ys,ds = [],[],[],[]\n",
    "running_reward = None\n",
    "reward_sum = 0\n",
    "episode_number = 1\n",
    "real_episodes = 1\n",
    "init = tf.global_variables_initializer()\n",
    "batch_size = real_bs\n",
    "\n",
    "drawFromModel = False # When set to True, will use model for observations\n",
    "trainTheModel = True # Whether to train the model\n",
    "trainThePolicy = False # Whether to train the policy\n",
    "switch_point = 1\n",
    "\n",
    "# Launch the graph\n",
    "with tf.Session() as sess:\n",
    "    rendering = False\n",
    "    sess.run(init)\n",
    "    observation = env.reset()\n",
    "    x = observation\n",
    "    gradBuffer = sess.run(tvars)\n",
    "    gradBuffer = resetGradBuffer(gradBuffer)\n",
    "    \n",
    "    while episode_number <= 5000:\n",
    "        # Start displaying environment once performance is acceptably high.\n",
    "        if (reward_sum/batch_size > 150 and drawFromModel == False) or rendering == True : \n",
    "            env.render()\n",
    "            rendering = True\n",
    "            \n",
    "        x = np.reshape(observation,[1,4])\n",
    "\n",
    "        tfprob = sess.run(probability,feed_dict={observations: x})\n",
    "        action = 1 if np.random.uniform() < tfprob else 0\n",
    "\n",
    "        # record various intermediates (needed later for backprop)\n",
    "        xs.append(x) \n",
    "        y = 1 if action == 0 else 0 \n",
    "        ys.append(y)\n",
    "        \n",
    "        # step the  model or real environment and get new measurements\n",
    "        if drawFromModel == False:\n",
    "            observation, reward, done, info = env.step(action)\n",
    "        else:\n",
    "            observation, reward, done = stepModel(sess,xs,action)\n",
    "                \n",
    "        reward_sum += reward\n",
    "        \n",
    "        ds.append(done*1)\n",
    "        drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)\n",
    "\n",
    "        if done: \n",
    "            \n",
    "            if drawFromModel == False: \n",
    "                real_episodes += 1\n",
    "            episode_number += 1\n",
    "\n",
    "            # stack together all inputs, hidden states, action gradients, and rewards for this episode\n",
    "            epx = np.vstack(xs)\n",
    "            epy = np.vstack(ys)\n",
    "            epr = np.vstack(drs)\n",
    "            epd = np.vstack(ds)\n",
    "            xs,drs,ys,ds = [],[],[],[] # reset array memory\n",
    "            \n",
    "            if trainTheModel == True:\n",
    "                actions = np.array([np.abs(y-1) for y in epy][:-1])\n",
    "                state_prevs = epx[:-1,:]\n",
    "                state_prevs = np.hstack([state_prevs,actions])\n",
    "                state_nexts = epx[1:,:]\n",
    "                rewards = np.array(epr[1:,:])\n",
    "                dones = np.array(epd[1:,:])\n",
    "                state_nextsAll = np.hstack([state_nexts,rewards,dones])\n",
    "\n",
    "                feed_dict={previous_state: state_prevs, true_observation: state_nexts,true_done:dones,true_reward:rewards}\n",
    "                loss,pState,_ = sess.run([model_loss,predicted_state,updateModel],feed_dict)\n",
    "            if trainThePolicy == True:\n",
    "                discounted_epr = discount_rewards(epr).astype('float32')\n",
    "                discounted_epr -= np.mean(discounted_epr)\n",
    "                discounted_epr /= np.std(discounted_epr)\n",
    "                tGrad = sess.run(newGrads,feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})\n",
    "                \n",
    "                # If gradients becom too large, end training process\n",
    "                if np.sum(tGrad[0] == tGrad[0]) == 0:\n",
    "                    break\n",
    "                for ix,grad in enumerate(tGrad):\n",
    "                    gradBuffer[ix] += grad\n",
    "                \n",
    "            if switch_point + batch_size == episode_number: \n",
    "                switch_point = episode_number\n",
    "                if trainThePolicy == True:\n",
    "                    sess.run(updateGrads,feed_dict={W1Grad: gradBuffer[0],W2Grad:gradBuffer[1]})\n",
    "                    gradBuffer = resetGradBuffer(gradBuffer)\n",
    "\n",
    "                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01\n",
    "                if drawFromModel == False:\n",
    "                    print('World Perf: Episode %f. Reward %f. action: %f. mean reward %f.' % (real_episodes,reward_sum/real_bs,action, running_reward/real_bs))\n",
    "                    if reward_sum/batch_size > 200:\n",
    "                        break\n",
    "                reward_sum = 0\n",
    "\n",
    "                # Once the model has been trained on 100 episodes, we start alternating between training the policy\n",
    "                # from the model and training the model from the real environment.\n",
    "                if episode_number > 100:\n",
    "                    drawFromModel = not drawFromModel\n",
    "                    trainTheModel = not trainTheModel\n",
    "                    trainThePolicy = not trainThePolicy\n",
    "            \n",
    "            if drawFromModel == True:\n",
    "                observation = np.random.uniform(-0.1,0.1,[4]) # Generate reasonable starting point\n",
    "                batch_size = model_bs\n",
    "            else:\n",
    "                observation = env.reset()\n",
    "                batch_size = real_bs\n",
    "                \n",
    "print(real_episodes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Checking model representation\n",
    "Here we can examine how well the model is able to approximate the true environment after training. The green line indicates the real environment, and the blue indicates model predictions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "plt.figure(figsize=(8, 12))\n",
    "for i in range(6):\n",
    "    plt.subplot(6, 2, 2*i + 1)\n",
    "    plt.plot(pState[:,i])\n",
    "    plt.subplot(6,2,2*i+1)\n",
    "    plt.plot(state_nextsAll[:,i])\n",
    "plt.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
